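/*
Replication Stata do-file for "Automated Content Analysis Across Six Languages"
Windsor, Cupit, Windsor 2019

Requires the following data files in the working directory:

lsa.dta
multi-un.dta

Also calls the excelcol command, which is not defined in this file and must be
installed separately (presumably the community-contributed SSC package; confirm).

*/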

set more off

capture: program drop nstrpos

/*
nstrpos searchin searchstr num

Returns in r(pos) the position (as counted by strpos(), measured in the
original string) of the num-th non-overlapping occurrence of searchstr
inside searchin, or 0 when there are fewer than num occurrences.

Arguments
  searchin   string to search
  searchstr  substring to look for
  num        which occurrence to report (1 = first); treated as 1 if omitted

The scan is deterministic: it walks forward from the end of each match
instead of overwriting earlier matches with random characters, so it does
not touch the random-number state, cannot accidentally create or destroy a
match, and leaves no global scalar behind. Compound quotes are used so that
arguments containing a double quote do not break the string literals.
*/
program define nstrpos, rclass
	args searchin searchstr num
	if `"`num'"' == "" local num 1
	local sublen = length(`"`searchstr'"')
	local pos 0
	local from 1
	forvalues k = 1/`num' {
		// Search only the part of the string after the previous match.
		local hit = strpos(substr(`"`searchin'"', `from', .), `"`searchstr'"')
		if `hit' == 0 {
			// Fewer than num occurrences: report "not found".
			local pos 0
			continue, break
		}
		// Convert the offset within the tail back to a position in searchin.
		local pos = `from' + `hit' - 1
		local from = `pos' + `sublen'
	}
	return scalar pos = `pos'
end

// Load the main dataset. `clear` lets the script be re-run in a session that
// already has (possibly modified) data in memory instead of aborting.
use "multi-un.dta", clear

// Variable lists, expanded from dataset order:
//   liwcvars       every variable from wc through otherp
//   liwcvars_pct   every variable from sixltr through otherp
//   liwcvars_indep the hand-picked subset used for the "Overall" effect size
unab liwcvars : wc - otherp
unab liwcvars_pct : sixltr - otherp
unab liwcvars_indep: i - they ipron article - quant posemo anx anger sad family - male insight - differ see-feel body-ingest affiliation - focusfuture motion ///
  - death swear - filler period-otherp
  

// Unique Identifier: the first 32 characters of filename

gen uid = substr(filename,1,32)

// Attach the uid-level variables stored in lsa.dta (LSA is used below).
// No keep() is given, so unmatched rows from either side are retained here.
merge m:1 uid using "lsa.dta", nogen

// NOTE(review): a missing LSA compares greater than any number in Stata, so
// rows without an LSA value are NOT removed by this condition -- confirm intent.
drop if LSA < 0.4

// Keep rows whose word count lies in [15, 230]; missing wc is dropped.
drop if wc < 15 | wc == .

drop if wc > 230




// Drop those without data for all languages
// (a uid is complete only if six rows remain for it)

bysort uid :gen incomplete = _N < 6

tab incomplete

drop if incomplete

// "Original" Language

// Two-character language code taken from positions 34-35 of filename.
generate orig = substr(filename, 34, 2)

// Within-language standard deviation of every measure, stored as <measure>_sd_.
// The data are sorted by language once; each egen then runs by group.
sort orig
foreach measure of local liwcvars {
	by orig: egen `measure'_sd_ = sd(`measure')
}

// filename is no longer needed once uid and orig have been extracted.
drop filename

// Numeric language code (en=1, ar=2, de=3, fr=4, ru=5, zh=6); any other value
// of orig stays missing. oneway below uses it as the grouping variable.
generate orig_n = .
local code = 0
foreach lg in en ar de fr ru zh {
	local ++code
	replace orig_n = `code' if orig == "`lg'"
}

// Mean and standard deviation of each measure, by language.
foreach measure of local liwcvars {
	display "For Variable `measure'"
	tabstat `measure', statistics(mean sd) by(orig)
}


// One-way analysis of variance of each measure across languages.
foreach measure of local liwcvars {
	display "For Variable `measure'"
	oneway `measure' orig_n
}


// Append an underscore to every measure name so the names can serve as
// reshape stubs (wc_ + language code, and so on).
foreach measure of varlist wc - otherp {
	rename `measure' `measure'_
}

drop orig_n

// Go from one row per (uid, language) to one row per uid: each measure and
// each per-language SD becomes one column per language code held in orig.
reshape wide wc_ - otherp_ wc_sd - otherp_sd, i(uid) j(orig) string

// Move the measure columns to the front, grouped by measure and, within a
// measure, in the language order en ar de fr ru zh.
local langs en ar de fr ru zh
foreach measure of local liwcvars {
	foreach lg of local langs {
		local orderlist `orderlist' `measure'_`lg'
	}
}

order `orderlist'

// Word counts may be stable when proportions are not.
// Turn each percentage measure into a rounded word count: pct * wc / 100.
local all_langs en ar de fr ru zh
local translated ar de fr ru zh

foreach lg of local all_langs {
	foreach measure of local liwcvars_pct {
		generate `measure'_`lg'_wc = round(`measure'_`lg' * wc_`lg'/100)
	}
}

// Generate Difference Variable: each language minus English, per measure.
foreach lg of local translated {
	foreach measure of local liwcvars {
		generate `measure'_`lg'_d = `measure'_`lg' - `measure'_en
	}
}

// The same difference, computed on the word-count versions.
foreach lg of local translated {
	foreach measure of local liwcvars_pct {
		generate `measure'_`lg'_wc_d = `measure'_`lg'_wc - `measure'_en_wc
	}
}

// Difference relative to the English value (missing where English is 0).
foreach lg of local translated {
	foreach measure of local liwcvars {
		generate `measure'_`lg'_pct = `measure'_`lg'_d/`measure'_en
	}
}

// Figure 1: histogram of affect_en with a vertical line at its mean.
summarize affect_en, meanonly
histogram affect_en, ///
  start(0) ///
  title("Histogram of Affect in English") ///
  xtitle("Percentage of Words") ytitle("Density") ///
  xline(`=r(mean)') ///
  graphregion(fcolor(none)) plotregion(fcolor(none)) bgcolor(white)
graph export "Affect.pdf", replace

// Figure 2: kernel density of affect_fr_d with a vertical line at its mean.
// NOTE(review): the title says "Histogram" although the plot is a kernel density.
summarize affect_fr_d, meanonly
twoway kdensity affect_fr_d, ///
  title("Histogram of Difference Affect translated from French") ///
  xtitle("Percentage of Words") ytitle("Density") ///
  xline(`=r(mean)') ///
  graphregion(fcolor(none)) plotregion(fcolor(none)) bgcolor(white)
graph export "Affect-d.pdf", replace

// Baseline t tests
// Writes tTestTable.xlsx: one row per measure, one column per language,
// each cell holding r(p) from a one-sample t test of the difference against 0.
putexcel set "tTestTable.xlsx", replace
putexcel B1=("Language")
putexcel A2=("LIWC Variable")
putexcel B2=("Arabic") C2=("German") D2=("French") ///
  E2=("Russian") F2=("Mandarin")
local out_row = 3
foreach measure of local liwcvars {
	putexcel A`out_row'=("`measure'")
	local out_col = 2
	foreach lg in ar de fr ru zh {
		// excelcol turns the column number into its spreadsheet letter.
		excelcol `out_col'
		local cell `r(column)'`out_row'
		quietly ttest `measure'_`lg'_d == 0
		putexcel `cell' = (`=r(p)')
		local out_col = `out_col' + 1
	}
	local out_row = `out_row' + 1
}

// Baseline t test for counts
// Writes tTestTableCounts.xlsx: same layout as the percentage table, but the
// tested variable is the word-count difference (<measure>_<lang>_wc_d).
putexcel set "tTestTableCounts.xlsx", replace
putexcel B1=("Language")
putexcel A2=("LIWC Variable")
putexcel B2=("Arabic") C2=("German") D2=("French") ///
  E2=("Russian") F2=("Mandarin")
local out_row = 3
foreach measure of local liwcvars_pct {
	putexcel A`out_row'=("`measure'")
	local out_col = 2
	foreach lg in ar de fr ru zh {
		// excelcol turns the column number into its spreadsheet letter.
		excelcol `out_col'
		local cell `r(column)'`out_row'
		quietly ttest `measure'_`lg'_wc_d == 0
		putexcel `cell' = (`=r(p)')
		local out_col = `out_col' + 1
	}
	local out_row = `out_row' + 1
}

// Cohen's d-Effect Sizes
// Writes EffectSize2.xlsx: for every measure and language, the mean of the
// difference from English divided by the English standard deviation.
// <measure>_sd_en holds the same value in every row (it was computed by
// language before the reshape), so observation 1 is read explicitly.
putexcel set "EffectSize2.xlsx", replace
putexcel B1=("Language")
putexcel A2=("LIWC Variable")
putexcel B2=("Arabic") C2=("German") D2=("French")
putexcel E2=("Russian") F2=("Mandarin")
tempname effect
local row = 3
foreach var of local liwcvars {
	putexcel A`row'=("`var'")
	local col = 2
	foreach lang in ar de fr ru zh {
		excelcol `col'
		local colname `r(column)'
		quietly summarize `var'_`lang'_d
		scalar `effect' = r(mean)/`var'_sd_en[1]
		// Log the value that is exported (the previous debug line printed the
		// first observation's difference over the SD, not the effect size).
		display "Effect size for `var' (`lang'): " `effect'
		putexcel `colname'`row' = (`effect')
		local ++col
		}
	local ++row
	}


// "Overall" row: sum of the mean differences over the liwcvars_indep measures,
// divided by the square root of the sum of their English variances.
putexcel A`row'=("Overall")
local col = 2
foreach lang in ar de fr ru zh {
	scalar `lang'_overall_d = 0
	scalar `lang'_overall_var = 0
	foreach var of local liwcvars_indep {
		quietly summarize `var'_`lang'_d
		scalar `lang'_overall_d = scalar(`lang'_overall_d) + r(mean)
		scalar `lang'_overall_var = scalar(`lang'_overall_var) + (`var'_sd_en[1])^2
		}
	excelcol `col'
	local colname `r(column)'
	putexcel `colname'`row' = (scalar(`lang'_overall_d) /sqrt(scalar(`lang'_overall_var)))
	local ++col
}
	
		

// Confidence intervals for the word-count differences
// Writes confidenceIntervals_WC.xlsx. Column B holds the English mean count;
// each language then takes two adjacent columns: lower bound, upper bound of
// the 95% interval for the mean difference from English.
putexcel set "confidenceIntervals_WC.xlsx", replace
putexcel B1=("Language")
putexcel A2=("LIWC Variable")
putexcel B2=("English Mean")
local col = 3
foreach lang in Arabic German French Russian Chinese {
	excelcol `col'
	local colname `r(column)'
	putexcel `colname'2 = ("`lang'")
	local col = `col'+2
}
local row = 3
foreach var of local liwcvars_pct {
	qui: putexcel A`row'=("`var'")
	sum `var'_en_wc , meanonly
	qui:putexcel B`row' = (r(mean))
	local col = 3
	foreach lang in ar de fr ru zh {
		excelcol `col'
		local colname `r(column)'
		qui: sum `var'_`lang'_wc_d
		// Use the number of non-missing differences, not _N, for both the
		// standard error and the t quantile; the two agree when nothing is missing.
		scalar sampleSize = r(N)
		scalar t = invttail(scalar(sampleSize)-1,0.025)
		scalar diff = r(mean)
		scalar standardError = r(sd)/sqrt(scalar(sampleSize))
		// scalar() forces the scalar lookup so that no variable name or
		// abbreviation can shadow these names inside the expression.
		qui: putexcel `colname'`row' = (scalar(diff)-scalar(t)*scalar(standardError))
		local ++col
		excelcol `col'
		local colname `r(column)'
		qui: putexcel `colname'`row' = (scalar(diff)+scalar(t)*scalar(standardError))
		local ++col
		}
	local ++row
	}
	
// Confidence intervals for the differences from English
// Writes confidenceIntervals.xlsx. Column B holds the English mean; each
// language then takes two adjacent columns: lower bound, upper bound of the
// 95% interval for the mean difference from English.
putexcel set "confidenceIntervals.xlsx", replace
putexcel B1=("Language")
putexcel A2=("LIWC Variable")
putexcel B2=("English Mean")
local col = 3
foreach lang in Arabic German French Russian Chinese {
	excelcol `col'
	local colname `r(column)'
	putexcel `colname'2 = ("`lang'")
	local col = `col'+2
}
local row = 3
foreach var of local liwcvars {
	qui: putexcel A`row'=("`var'")
	sum `var'_en , meanonly
	scalar mean_en = r(mean)
	qui:putexcel B`row' = (scalar(mean_en))
	local col = 3
	foreach lang in ar de fr ru zh {
		excelcol `col'
		local colname `r(column)'
		qui: sum `var'_`lang'_d
		// Use the number of non-missing differences, not _N, for both the
		// standard error and the t quantile; the two agree when nothing is missing.
		scalar sampleSize = r(N)
		scalar t = invttail(scalar(sampleSize)-1,0.025)
		scalar diff = r(mean)
		scalar standardError = r(sd)/sqrt(scalar(sampleSize))
		// scalar() forces the scalar lookup so that no variable name or
		// abbreviation can shadow these names inside the expression.
		qui: putexcel `colname'`row' = ((scalar(diff)-scalar(t)*scalar(standardError)))
		local ++col
		excelcol `col'
		local colname `r(column)'
		qui: putexcel `colname'`row' = ((scalar(diff)+scalar(t)*scalar(standardError)))
		local ++col
		}
	local ++row
	}
	
	// confidence intervals 
// Writes confidenceIntervalsPct.xlsx. Column B holds the English mean; each
// language then takes two adjacent columns: lower bound, upper bound of the
// 95% interval for the mean difference from English, each divided by the
// English mean.
putexcel set "confidenceIntervalsPct.xlsx", replace
putexcel B1=("Language")
putexcel A2=("LIWC Variable")
putexcel B2=("English Mean")
local col = 3
foreach lang in Arabic German French Russian Chinese {
	excelcol `col'
	local colname `r(column)'
	putexcel `colname'2 = ("`lang'")
	local col = `col'+2
}
local row = 3
foreach var of local liwcvars {
	qui: putexcel A`row'=("`var'")
	sum `var'_en , meanonly
	scalar mean_en = r(mean)
	qui:putexcel B`row' = (scalar(mean_en))
	local col = 3
	foreach lang in ar de fr ru zh {
		excelcol `col'
		local colname `r(column)'
		qui: sum `var'_`lang'_d
		// Use the number of non-missing differences, not _N, for both the
		// standard error and the t quantile; the two agree when nothing is missing.
		scalar sampleSize = r(N)
		scalar t = invttail(scalar(sampleSize)-1,0.025)
		scalar diff = r(mean)
		scalar standardError = r(sd)/sqrt(scalar(sampleSize))
		// scalar() forces the scalar lookup so that no variable name or
		// abbreviation can shadow these names inside the expression.
		qui: putexcel `colname'`row' = ((scalar(diff)-scalar(t)*scalar(standardError))/scalar(mean_en))
		local ++col
		excelcol `col'
		local colname `r(column)'
		qui: putexcel `colname'`row' = ((scalar(diff)+scalar(t)*scalar(standardError))/scalar(mean_en))
		local ++col
		}
	local ++row
	}

// Correlations
// Writes Correlations.xlsx: for every measure, the correlation between the
// English column and each other language's column.
putexcel set "Correlations.xlsx", replace
putexcel B1=("Language")
putexcel A2=("LIWC Variable")
putexcel B2=("Arabic") C2=("German") D2=("French") ///
  E2=("Russian") F2=("Mandarin")
local out_row = 3
foreach measure of local liwcvars {
	putexcel A`out_row'=("`measure'")
	local out_col = 2
	foreach lg in ar de fr ru zh {
		excelcol `out_col'
		local cell `r(column)'`out_row'
		correlate `measure'_en `measure'_`lg'
		// The off-diagonal element of the 2x2 matrix is the coefficient.
		matrix corrMat = r(C)
		scalar corr = corrMat[1,2]
		putexcel `cell' = (corr)
		local out_col = `out_col' + 1
	}
	local out_row = `out_row' + 1
}

// Correlations (word counts)
// Writes Correlations_WC.xlsx: for every percentage measure, the correlation
// between the English word-count column and each other language's.
putexcel set "Correlations_WC.xlsx", replace
putexcel B1=("Language")
putexcel A2=("LIWC Variable")
putexcel B2=("Arabic") C2=("German") D2=("French") ///
  E2=("Russian") F2=("Mandarin")
local out_row = 3
foreach measure of local liwcvars_pct {
	putexcel A`out_row'=("`measure'")
	local out_col = 2
	foreach lg in ar de fr ru zh {
		excelcol `out_col'
		local cell `r(column)'`out_row'
		correlate `measure'_en_wc `measure'_`lg'_wc
		// The off-diagonal element of the 2x2 matrix is the coefficient.
		matrix corrMat = r(C)
		scalar corr = corrMat[1,2]
		putexcel `cell' = (corr)
		local out_col = `out_col' + 1
	}
	local out_row = `out_row' + 1
}
	
// Absolute Sizes
// Writes Absolute.xlsx: column B is the English mean of each measure and the
// next five columns are the mean differences from English, one per language.
putexcel set "Absolute.xlsx", replace
putexcel C1=("Change from English")
putexcel A2=("LIWC Variable")
putexcel C2=("Arabic") D2=("German") E2=("French") ///
  F2=("Russian") G2=("Mandarin") B2=("English")

// First pass: measure names and mean differences.
local out_row = 3
foreach measure of local liwcvars {
	putexcel A`out_row'=("`measure'")
	local out_col = 3
	foreach lg in ar de fr ru zh {
		excelcol `out_col'
		local cell `r(column)'`out_row'
		quietly summarize `measure'_`lg'_d, meanonly
		quietly putexcel `cell' = (r(mean))
		local out_col = `out_col' + 1
	}
	local out_row = `out_row' + 1
}

// Second pass: English means, always written to spreadsheet column 2.
local out_row = 3
foreach measure of local liwcvars {
	excelcol 2
	local cell `r(column)'`out_row'
	quietly summarize `measure'_en, meanonly
	quietly putexcel `cell' = (r(mean))
	local out_row = `out_row' + 1
}

// Percent Change
// Writes "Percent Change.xlsx": column B is the English mean of each measure
// and the next five columns are the means of <measure>_<lang>_pct.
putexcel set "Percent Change.xlsx", replace
putexcel C1=("Mean Percentage Change from English")
putexcel A2=("LIWC Variable")
putexcel C2=("Arabic") D2=("German") E2=("French") ///
  F2=("Russian") G2=("Mandarin") B2=("English")

// First pass: measure names and mean relative differences.
local out_row = 3
foreach measure of local liwcvars {
	putexcel A`out_row'=("`measure'")
	local out_col = 3
	foreach lg in ar de fr ru zh {
		excelcol `out_col'
		local cell `r(column)'`out_row'
		quietly summarize `measure'_`lg'_pct, meanonly
		quietly putexcel `cell' = (r(mean))
		local out_col = `out_col' + 1
	}
	local out_row = `out_row' + 1
}

// Second pass: English means, always written to spreadsheet column 2.
local out_row = 3
foreach measure of local liwcvars {
	excelcol 2
	local cell `r(column)'`out_row'
	quietly summarize `measure'_en, meanonly
	quietly putexcel `cell' = (r(mean))
	local out_row = `out_row' + 1
}



// Summary
// Writes Summary.xlsx: for every measure, a (mean, standard deviation) pair of
// columns per language, in the order en ar de fr ru zh starting at column B.
putexcel set "Summary.xlsx", replace
putexcel B1=("Language")
putexcel A3=("LIWC Variable")
putexcel D2=("Arabic") F2=("German") H2=("French") ///
  J2=("Russian") L2=("Mandarin") B2=("English")

// Row 3: a "Mean" / "Std. Dev." header pair under each of the six languages.
local out_col = 2
forvalues pair = 1/6 {
	foreach label in "Mean" "Std. Dev." {
		excelcol `out_col'
		putexcel `r(column)'3 = ("`label'")
		local out_col = `out_col' + 1
	}
}

// Body: summarize is repeated for each statistic because excelcol is called
// in between and the r() results are read straight after summarize.
local out_row = 4
foreach measure of local liwcvars {
	putexcel A`out_row'=("`measure'")
	local out_col = 2
	foreach lg in en ar de fr ru zh {
		foreach stat in mean sd {
			excelcol `out_col'
			local cell `r(column)'`out_row'
			summarize `measure'_`lg'
			putexcel `cell' = (r(`stat'))
			local out_col = `out_col' + 1
		}
	}
	local out_row = `out_row' + 1
}
